Setup

library(tidyverse)
library(magrittr)
library(parallel)
library(pander)
library(here)
library(scales)
library(ggpubr)
library(kableExtra)
library(edgeR)
library(DT)
library(ggrepel)
library(pheatmap)
library(ggdendro)
if (interactive()) setwd(here::here())
theme_set(theme_bw())
cores <- detectCores() - 2
load(
    here::here("1_Psen2S4Ter/R/output/1_1_DE.RData")
)

k-mer analysis

jellyfish v2.3.0 was used to count kmers of trimmed fastq files that had been filtered for rRNA sequences. This was performed for 5 values: \(k = 5, 6, 7, 8, 9, 10\). Lower values of \(k\) lose specificity in comparison to higher values, however as \(k\) increases, the exponential increase of possible kmers causes limitations due to computational processing time.

k = 5

Counts

k5files <- list.files(
    "/mnt/hpcfs/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/analysis-kmer/results/jellyfish/k5",
    pattern = ".dumps", full.names = TRUE
)
k5counts <- lapply(k5files, function(x){
    read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
        set_colnames(str_remove_all(colnames(.), "_6month_.+_F3|[0-9]*_Ps2Ex3M1_|\\.dumps"))
}) %>%
    purrr::reduce(full_join) %>%
    dplyr::select(mer, contains(c("WT", "Heter")))
k5dge <- k5counts %>%
    as.data.frame() %>%
    column_to_rownames("mer") %>%
    DGEList() %>%
    calcNormFactors()
k5dge$samples %<>%
    rownames_to_column("rowname") %>%
    mutate(sample = rowname) %>%
    left_join(addInfo) %>%
    column_to_rownames("rowname")
k5dge$samples$group <- colnames(k5dge) %>%
    str_extract("(WT|Heter)") %>%
    factor(levels = c("WT", "Heter"))

Properties

k5dist <- k5dge %>%
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
    ggplot(aes(x=count, colour = sample)) +
    geom_density() +
    labs(x = "intensity", title = "Distribution of 5-mers")
k5labels <- k5dge$samples %>% 
    mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
    .$label
k5heat <- k5dge %>% 
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    t() %>%
    pheatmap(silent = TRUE, cluster_cols = FALSE,
             show_colnames = FALSE, fontsize = 9,
             fontsize_row = 10, border_color = NA,
             main = "5-mer counts heatmap", labels_row = k5labels)
k5heat$tree_row$labels <- k5labels
k5den <- ggdendrogram(k5heat$tree_row, rotate = TRUE) +
    labs(title = "Hierarchical clustering of 5-mer counts") +
    theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k5pca <- k5dge %>%
    cpm(log = TRUE) %>%
    t() %>%
    prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k5pca)$importance %>% pander(split.tables = Inf)
k5pcaPlot <- k5pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k5dge$samples) %>%
    ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
    geom_point(alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    labs(
        x = paste0("PC1 (", percent(summary(k5pca)$importance[2, "PC1"]), ")"),
        y = paste0("PC2 (", percent(summary(k5pca)$importance[2, "PC2"]), ")"),
        colour = "Genotype",
        title = "k = 5"
    ) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k5pcaRrna <- k5pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k5dge$samples) %>%
    ggplot(aes(PC1, rRNA, label = rRNA)) +
    geom_point(aes(colour = group), alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    geom_smooth(method = "lm") +
    labs(
        x = paste0("PC1 (", percent(summary(k5pca)$importance[2, "PC1"]), ")"),
        y = "rRNA proportion",
        colour = "Genotype",
        title = "k = 5"
    ) +
    scale_y_continuous(labels = percent) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k5design <- model.matrix(~rRNA, data = k5dge$samples)
k5voom <- voom(k5dge, design = k5design)
k5fit <- lmFit(k5voom, design = k5design)
k5eBayes <- eBayes(k5fit)
k5topTable <- k5eBayes %>%
    topTable(coef = colnames(k5design)[2], sort.by = "p", n = Inf) %>%
    set_colnames(str_remove(colnames(.), "ID\\.")) %>%
    rownames_to_column("mer") %>%
    mutate(BY = p.adjust(P.Value, "BY")) %>%
    mutate(DE = BY < 0.05) %>%
    dplyr::select(
        mer,
        AveExpr,
        logFC,
        P.Value,
        FDR = adj.P.Val,
        BY,
        t,
        DE,
        everything(),
        -B
    ) %>%
    as_tibble()

k = 6

Counts

k6files <- list.files(
    "/mnt/hpcfs/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/analysis-kmer/results/jellyfish/k6",
    pattern = ".dumps", full.names = TRUE
)
k6counts <- lapply(k6files, function(x){
    read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
        set_colnames(str_remove_all(colnames(.), "_6month_.+_F3|[0-9]*_Ps2Ex3M1_|\\.dumps"))
}) %>%
    purrr::reduce(full_join) %>%
    dplyr::select(mer, contains(c("WT", "Heter")))
k6dge <- k6counts %>%
    as.data.frame() %>%
    column_to_rownames("mer") %>%
    DGEList() %>%
    calcNormFactors()
k6dge$samples %<>%
    rownames_to_column("rowname") %>%
    mutate(sample = rowname) %>%
    left_join(addInfo) %>%
    column_to_rownames("rowname")
k6dge$samples$group <- colnames(k6dge) %>%
    str_extract("(WT|Heter)") %>%
    factor(levels = c("WT", "Heter"))

Properties

k6dist <- k6dge %>%
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
    ggplot(aes(x=count, colour = sample)) +
    geom_density() +
    labs(x = "intensity", title = "Distribution of 6-mers")
k6labels <- k6dge$samples %>% 
    mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
    .$label
k6heat <- k6dge %>% 
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    t() %>%
    pheatmap(silent = TRUE, cluster_cols = FALSE,
             show_colnames = FALSE, fontsize = 9,
             fontsize_row = 10, border_color = NA,
             main = "6-mer counts heatmap", labels_row = k6labels)
k6heat$tree_row$labels <- k6labels
k6den <- ggdendrogram(k6heat$tree_row, rotate = TRUE) +
    labs(title = "Hierarchical clustering of 6-mer counts") +
    theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k6pca <- k6dge %>%
    cpm(log = TRUE) %>%
    t() %>%
    prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k6pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k6pcaPlot <- k6pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k6dge$samples) %>%
    ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
    geom_point(alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    labs(
        x = paste0("PC1 (", percent(summary(k6pca)$importance[2, "PC1"]), ")"),
        y = paste0("PC2 (", percent(summary(k6pca)$importance[2, "PC2"]), ")"),
        colour = "Genotype",
        title = "k = 6"
    ) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k6pcaRrna <- k6pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k6dge$samples) %>%
    ggplot(aes(PC1, rRNA, label = rRNA)) +
    geom_point(aes(colour = group), alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    geom_smooth(method = "lm") +
    labs(
        x = paste0("PC1 (", percent(summary(k6pca)$importance[2, "PC1"]), ")"),
        y = "rRNA proportion",
        colour = "Genotype",
        title = "k = 6"
    ) +
    scale_y_continuous(labels = percent) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k6design <- model.matrix(~rRNA, data = k6dge$samples)
k6voom <- voom(k6dge, design = k6design)
k6fit <- lmFit(k6voom, design = k6design)
k6eBayes <- eBayes(k6fit)
k6topTable <- k6eBayes %>%
    topTable(coef = colnames(k6design)[2], sort.by = "p", n = Inf) %>%
    set_colnames(str_remove(colnames(.), "ID\\.")) %>%
    rownames_to_column("mer") %>%
    mutate(BY = p.adjust(P.Value, "BY")) %>%
    mutate(DE = BY < 0.05) %>%
    dplyr::select(
        mer,
        AveExpr,
        logFC,
        P.Value,
        FDR = adj.P.Val,
        BY,
        t,
        DE,
        everything(),
        -B
    ) %>%
    as_tibble()

k = 7

Counts

k7files <- list.files(
    "/mnt/hpcfs/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/analysis-kmer/results/jellyfish/k7",
    pattern = ".dumps", full.names = TRUE
)
k7counts <- lapply(k7files, function(x){
    read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
        set_colnames(str_remove_all(colnames(.), "_6month_.+_F3|[0-9]*_Ps2Ex3M1_|\\.dumps"))
}) %>%
    purrr::reduce(full_join) %>%
    dplyr::select(mer, contains(c("WT", "Heter")))
k7dge <- k7counts %>%
    as.data.frame() %>%
    column_to_rownames("mer") %>%
    DGEList() %>%
    calcNormFactors()
k7dge$samples %<>%
    rownames_to_column("rowname") %>%
    mutate(sample = rowname) %>%
    left_join(addInfo) %>%
    column_to_rownames("rowname")
k7dge$samples$group <- colnames(k7dge) %>%
    str_extract("(WT|Heter)") %>%
    factor(levels = c("WT", "Heter"))

Properties

k7dist <- k7dge %>%
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
    ggplot(aes(x=count, colour = sample)) +
    geom_density() +
    labs(x = "intensity", title = "Distribution of 7-mers")
k7labels <- k7dge$samples %>% 
    mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
    .$label
k7heat <- k7dge %>% 
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    t() %>%
    pheatmap(silent = TRUE, cluster_cols = FALSE,
             show_colnames = FALSE, fontsize = 9,
             fontsize_row = 10, border_color = NA,
             main = "7-mer counts heatmap", labels_row = k7labels)
k7heat$tree_row$labels <- k7labels
k7den <- ggdendrogram(k7heat$tree_row, rotate = TRUE) +
    labs(title = "Hierarchical clustering of 7-mer counts") +
    theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k7pca <- k7dge %>%
    cpm(log = TRUE) %>%
    t() %>%
    prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k7pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k7pcaPlot <- k7pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k7dge$samples) %>%
    ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
    geom_point(alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    labs(
        x = paste0("PC1 (", percent(summary(k7pca)$importance[2, "PC1"]), ")"),
        y = paste0("PC2 (", percent(summary(k7pca)$importance[2, "PC2"]), ")"),
        colour = "Genotype",
        title = "k = 7"
    ) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k7pcaRrna <- k7pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k7dge$samples) %>%
    ggplot(aes(PC1, rRNA, label = rRNA)) +
    geom_point(aes(colour = group), alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    geom_smooth(method = "lm") +
    labs(
        x = paste0("PC1 (", percent(summary(k7pca)$importance[2, "PC1"]), ")"),
        y = "rRNA proportion",
        colour = "Genotype",
        title = "k = 7"
    ) +
    scale_y_continuous(labels = percent) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k7design <- model.matrix(~rRNA, data = k7dge$samples)
k7voom <- voom(k7dge, design = k7design)
k7fit <- lmFit(k7voom, design = k7design)
k7eBayes <- eBayes(k7fit)
k7topTable <- k7eBayes %>%
    topTable(coef = colnames(k7design)[2], sort.by = "p", n = Inf) %>%
    set_colnames(str_remove(colnames(.), "ID\\.")) %>%
    rownames_to_column("mer") %>%
    mutate(BY = p.adjust(P.Value, "BY")) %>%
    mutate(DE = BY < 0.05) %>%
    dplyr::select(
        mer,
        AveExpr,
        logFC,
        P.Value,
        FDR = adj.P.Val,
        BY,
        t,
        DE,
        everything(),
        -B
    ) %>%
    as_tibble()

k = 8

Counts

k8files <- list.files(
    "/mnt/hpcfs/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/analysis-kmer/results/jellyfish/k8",
    pattern = ".dumps", full.names = TRUE
)
k8counts <- lapply(k8files, function(x){
    read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
        set_colnames(str_remove_all(colnames(.), "_6month_.+_F3|[0-9]*_Ps2Ex3M1_|\\.dumps"))
}) %>%
    purrr::reduce(full_join) %>%
    dplyr::select(mer, contains(c("WT", "Heter")))
k8dge <- k8counts %>%
    as.data.frame() %>%
    column_to_rownames("mer") %>%
    DGEList() %>%
    calcNormFactors()
k8dge$samples %<>%
    rownames_to_column("rowname") %>%
    mutate(sample = rowname) %>%
    left_join(addInfo) %>%
    column_to_rownames("rowname")
k8dge$samples$group <- colnames(k8dge) %>%
    str_extract("(WT|Heter)") %>%
    factor(levels = c("WT", "Heter"))

Properties

k8dist <- k8dge %>%
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
    ggplot(aes(x=count, colour = sample)) +
    geom_density() +
    labs(x = "intensity", title = "Distribution of 8-mers")
k8labels <- k8dge$samples %>% 
    mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
    .$label
k8heat <- k8dge %>% 
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    t() %>%
    pheatmap(silent = TRUE, cluster_cols = FALSE,
             show_colnames = FALSE, fontsize = 9,
             fontsize_row = 10, border_color = NA,
             main = "8-mer counts heatmap", labels_row = k8labels)
k8heat$tree_row$labels <- k8labels
k8den <- ggdendrogram(k8heat$tree_row, rotate = TRUE) +
    labs(title = "Hierarchical clustering of 8-mer counts") +
    theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k8pca <- k8dge %>%
    cpm(log = TRUE) %>%
    t() %>%
    prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k8pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k8pcaPlot <- k8pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k8dge$samples) %>%
    ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
    geom_point(alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    labs(
        x = paste0("PC1 (", percent(summary(k8pca)$importance[2, "PC1"]), ")"),
        y = paste0("PC2 (", percent(summary(k8pca)$importance[2, "PC2"]), ")"),
        colour = "Genotype",
        title = "k = 8"
    ) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k8pcaRrna <- k8pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k8dge$samples) %>%
    ggplot(aes(PC1, rRNA, label = rRNA)) +
    geom_point(aes(colour = group), alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    geom_smooth(method = "lm") +
    labs(
        x = paste0("PC1 (", percent(summary(k8pca)$importance[2, "PC1"]), ")"),
        y = "rRNA proportion",
        colour = "Genotype",
        title = "k = 8"
    ) +
    scale_y_continuous(labels = percent) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k8design <- model.matrix(~rRNA, data = k8dge$samples)
k8voom <- voom(k8dge, design = k8design)
k8fit <- lmFit(k8voom, design = k8design)
k8eBayes <- eBayes(k8fit)
k8topTable <- k8eBayes %>%
    topTable(coef = colnames(k8design)[2], sort.by = "p", n = Inf) %>%
    set_colnames(str_remove(colnames(.), "ID\\.")) %>%
    rownames_to_column("mer") %>%
    mutate(BY = p.adjust(P.Value, "BY")) %>%
    mutate(DE = BY < 0.05) %>%
    dplyr::select(
        mer,
        AveExpr,
        logFC,
        P.Value,
        FDR = adj.P.Val,
        BY,
        t,
        DE,
        everything(),
        -B
    ) %>%
    as_tibble()

k = 9

Counts

k9files <- list.files(
    "/mnt/hpcfs/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/analysis-kmer/results/jellyfish/k9",
    pattern = ".dumps", full.names = TRUE
)
k9counts <- lapply(k9files, function(x){
    read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
        set_colnames(str_remove_all(colnames(.), "_6month_.+_F3|[0-9]*_Ps2Ex3M1_|\\.dumps"))
}) %>%
    purrr::reduce(full_join) %>%
    dplyr::select(mer, contains(c("WT", "Heter")))
k9dge <- k9counts %>%
    as.data.frame() %>%
    column_to_rownames("mer") %>%
    DGEList() %>%
    calcNormFactors()
k9dge$samples %<>%
    rownames_to_column("rowname") %>%
    mutate(sample = rowname) %>%
    left_join(addInfo) %>%
    column_to_rownames("rowname")
k9dge$samples$group <- colnames(k9dge) %>%
    str_extract("(WT|Heter)") %>%
    factor(levels = c("WT", "Heter"))

Properties

k9dist <- k9dge %>%
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
    ggplot(aes(x=count, colour = sample)) +
    geom_density() +
    labs(x = "intensity", title = "Distribution of 9-mers")
k9labels <- k9dge$samples %>% 
    mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
    .$label
k9heat <- k9dge %>% 
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    t() %>%
    pheatmap(silent = TRUE, cluster_cols = FALSE,
             show_colnames = FALSE, fontsize = 9,
             fontsize_row = 10, border_color = NA,
             main = "9-mer counts heatmap", labels_row = k9labels)
k9heat$tree_row$labels <- k9labels
k9den <- ggdendrogram(k9heat$tree_row, rotate = TRUE) +
    labs(title = "Hierarchical clustering of 9-mer counts") +
    theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k9pca <- k9dge %>%
    cpm(log = TRUE) %>%
    t() %>%
    prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k9pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k9pcaPlot <- k9pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k9dge$samples) %>%
    ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
    geom_point(alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    labs(
        x = paste0("PC1 (", percent(summary(k9pca)$importance[2, "PC1"]), ")"),
        y = paste0("PC2 (", percent(summary(k9pca)$importance[2, "PC2"]), ")"),
        colour = "Genotype",
        title = "k = 9"
    ) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k9pcaRrna <- k9pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k9dge$samples) %>%
    ggplot(aes(PC1, rRNA, label = rRNA)) +
    geom_point(aes(colour = group), alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    geom_smooth(method = "lm") +
    labs(
        x = paste0("PC1 (", percent(summary(k9pca)$importance[2, "PC1"]), ")"),
        y = "rRNA proportion",
        colour = "Genotype",
        title = "k = 9"
    ) +
    scale_y_continuous(labels = percent) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k9design <- model.matrix(~rRNA, data = k9dge$samples)
k9voom <- voom(k9dge, design = k9design)
k9fit <- lmFit(k9voom, design = k9design)
k9eBayes <- eBayes(k9fit)
k9topTable <- k9eBayes %>%
    topTable(coef = colnames(k9design)[2], sort.by = "p", n = Inf) %>%
    set_colnames(str_remove(colnames(.), "ID\\.")) %>%
    rownames_to_column("mer") %>%
    mutate(BY = p.adjust(P.Value, "BY")) %>%
    mutate(DE = BY < 0.05) %>%
    dplyr::select(
        mer,
        AveExpr,
        logFC,
        P.Value,
        FDR = adj.P.Val,
        BY,
        t,
        DE,
        everything(),
        -B
    ) %>%
    as_tibble()

k = 10

Counts

k10files <- list.files(
    "/mnt/hpcfs/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/analysis-kmer/results/jellyfish/k10",
    pattern = ".dumps", full.names = TRUE
)
k10counts <- lapply(k10files, function(x){
    read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
        set_colnames(str_remove_all(colnames(.), "_6month_.+_F3|[0-9]*_Ps2Ex3M1_|\\.dumps"))
}) %>%
    purrr::reduce(full_join) %>%
    dplyr::select(mer, contains(c("WT", "Heter")))
k10dge <- k10counts %>%
    as.data.frame() %>%
    column_to_rownames("mer") %>%
    DGEList() %>%
    calcNormFactors()
k10dge$samples %<>%
    rownames_to_column("rowname") %>%
    mutate(sample = rowname) %>%
    left_join(addInfo) %>%
    column_to_rownames("rowname")
k10dge$samples$group <- colnames(k10dge) %>%
    str_extract("(WT|Heter)") %>%
    factor(levels = c("WT", "Heter"))

Properties

k10dist <- k10dge %>%
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
    ggplot(aes(x=count, colour = sample)) +
    geom_density() +
    labs(x = "intensity", title = "Distribution of 10-mers")
k10labels <- k10dge$samples %>% 
    mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
    .$label
k10heat <- k10dge %>% 
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    t() %>%
    pheatmap(silent = TRUE, cluster_cols = FALSE,
             show_colnames = FALSE, fontsize = 9,
             fontsize_row = 10, border_color = NA,
             main = "10-mer counts heatmap", labels_row = k10labels)
k10heat$tree_row$labels <- k10labels
k10den <- ggdendrogram(k10heat$tree_row, rotate = TRUE) +
    labs(title = "Hierarchical clustering of 10-mer counts") +
    theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k10pca <- k10dge %>%
    cpm(log = TRUE) %>%
    t() %>%
    prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k10pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k10pcaPlot <- k10pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k10dge$samples) %>%
    ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
    geom_point(alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    labs(
        x = paste0("PC1 (", percent(summary(k10pca)$importance[2, "PC1"]), ")"),
        y = paste0("PC2 (", percent(summary(k10pca)$importance[2, "PC2"]), ")"),
        colour = "Genotype",
        title = "k = 10"
    ) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k10pcaRrna <- k10pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k10dge$samples) %>%
    ggplot(aes(PC1, rRNA, label = rRNA)) +
    geom_point(aes(colour = group), alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    geom_smooth(method = "lm") +
    labs(
        x = paste0("PC1 (", percent(summary(k10pca)$importance[2, "PC1"]), ")"),
        y = "rRNA proportion",
        colour = "Genotype",
        title = "k = 10"
    ) +
    scale_y_continuous(labels = percent) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k10design <- model.matrix(~rRNA, data = k10dge$samples)
k10voom <- voom(k10dge, design = k10design)
k10fit <- lmFit(k10voom, design = k10design)
k10eBayes <- eBayes(k10fit)
k10topTable <- k10eBayes %>%
    topTable(coef = colnames(k10design)[2], sort.by = "p", n = Inf) %>%
    set_colnames(str_remove(colnames(.), "ID\\.")) %>%
    rownames_to_column("mer") %>%
    mutate(BY = p.adjust(P.Value, "BY")) %>%
    mutate(DE = BY < 0.05) %>%
    dplyr::select(
        mer,
        AveExpr,
        logFC,
        P.Value,
        FDR = adj.P.Val,
        BY,
        t,
        DE,
        everything(),
        -B
    ) %>%
    as_tibble()

k = 11

Counts

k11files <- list.files(
    "/mnt/hpcfs/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/analysis-kmer/results/jellyfish/k11",
    pattern = ".dumps", full.names = TRUE
)
k11counts <- lapply(k11files, function(x){
    read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
        set_colnames(str_remove_all(colnames(.), "_6month_.+_F3|[0-9]*_Ps2Ex3M1_|\\.dumps"))
}) %>%
    purrr::reduce(full_join) %>%
    dplyr::select(mer, contains(c("WT", "Heter")))
k11dge <- k11counts %>%
    as.data.frame() %>%
    column_to_rownames("mer") %>%
    replace(is.na(.), 0) %>%
    DGEList() %>%
    calcNormFactors()
k11dge$samples %<>%
    rownames_to_column("rowname") %>%
    mutate(sample = rowname) %>%
    left_join(addInfo) %>%
    column_to_rownames("rowname")
k11dge$samples$group <- colnames(k11dge) %>%
    str_extract("(WT|Heter)") %>%
    factor(levels = c("WT", "Heter"))

Properties

k11dist <- k11dge %>%
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
    ggplot(aes(x=count, colour = sample)) +
    geom_density() +
    labs(x = "intensity", title = "Distribution of 11-mers")
k11labels <- k11dge$samples %>% 
    mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
    .$label
k11heat <- k11dge %>% 
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    t() %>%
    pheatmap(silent = TRUE, cluster_cols = FALSE,
             show_colnames = FALSE, fontsize = 9,
             fontsize_row = 10, border_color = NA,
             main = "11-mer counts heatmap", labels_row = k11labels)
k11heat$tree_row$labels <- k11labels
k11den <- ggdendrogram(k11heat$tree_row, rotate = TRUE) +
    labs(title = "Hierarchical clustering of 11-mer counts") +
    theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k11pca <- k11dge %>%
    cpm(log = TRUE) %>%
    t() %>%
    prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k11pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k11pcaPlot <- k11pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k11dge$samples) %>%
    ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
    geom_point(alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    labs(
        x = paste0("PC1 (", percent(summary(k11pca)$importance[2, "PC1"]), ")"),
        y = paste0("PC2 (", percent(summary(k11pca)$importance[2, "PC2"]), ")"),
        colour = "Genotype",
        title = "k = 11"
    ) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k11pcaRrna <- k11pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k11dge$samples) %>%
    ggplot(aes(PC1, rRNA, label = rRNA)) +
    geom_point(aes(colour = group), alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    geom_smooth(method = "lm") +
    labs(
        x = paste0("PC1 (", percent(summary(k11pca)$importance[2, "PC1"]), ")"),
        y = "rRNA proportion",
        colour = "Genotype",
        title = "k = 11"
    ) +
    scale_y_continuous(labels = percent) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k11design <- model.matrix(~rRNA, data = k11dge$samples)
k11voom <- voom(k11dge, design = k11design)
k11fit <- lmFit(k11voom, design = k11design)
k11eBayes <- eBayes(k11fit)
k11topTable <- k11eBayes %>%
    topTable(coef = colnames(k11design)[2], sort.by = "p", n = Inf) %>%
    set_colnames(str_remove(colnames(.), "ID\\.")) %>%
    rownames_to_column("mer") %>%
    mutate(BY = p.adjust(P.Value, "BY")) %>%
    mutate(DE = BY < 0.05) %>%
    dplyr::select(
        mer,
        AveExpr,
        logFC,
        P.Value,
        FDR = adj.P.Val,
        BY,
        t,
        DE,
        everything(),
        -B
    ) %>%
    as_tibble()

k = 12

Counts

k12files <- list.files(
    "/mnt/hpcfs/a1647910/20200310_rRNADepletion/1_Psen2S4Ter/analysis-kmer/results/jellyfish/k12",
    pattern = ".dumps", full.names = TRUE
)
k12counts <- lapply(k12files, function(x){
    read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
        set_colnames(str_remove_all(colnames(.), "_6month_.+_F3|[0-9]*_Ps2Ex3M1_|\\.dumps"))
}) %>%
    purrr::reduce(full_join) %>%
    dplyr::select(mer, contains(c("WT", "Heter")))
k12dge <- k12counts %>%
    as.data.frame() %>%
    column_to_rownames("mer") %>%
    replace(is.na(.), 0) %>%
    DGEList() %>%
    calcNormFactors()
k12dge$samples %<>%
    rownames_to_column("rowname") %>%
    mutate(sample = rowname) %>%
    left_join(addInfo) %>%
    column_to_rownames("rowname")
k12dge$samples$group <- colnames(k12dge) %>%
    str_extract("(WT|Heter)") %>%
    factor(levels = c("WT", "Heter"))

Properties

k12dist <- k12dge %>%
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
    ggplot(aes(x=count, colour = sample)) +
    geom_density() +
    labs(x = "intensity", title = "Distribution of 12-mers")
k12labels <- k12dge$samples %>% 
    mutate(label = paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")) %>% 
    .$label
k12heat <- k12dge %>% 
    cpm(log = TRUE) %>%
    as.data.frame() %>%
    t() %>%
    pheatmap(silent = TRUE, cluster_cols = FALSE,
             show_colnames = FALSE, fontsize = 9,
             fontsize_row = 10, border_color = NA,
             main = "12-mer counts heatmap", labels_row = k12labels)
k12heat$tree_row$labels <- k12labels
k12den <- ggdendrogram(k12heat$tree_row, rotate = TRUE) +
    labs(title = "Hierarchical clustering of 12-mer counts") +
    theme(plot.title = element_text(size = 12))
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
k12pca <- k12dge %>%
    cpm(log = TRUE) %>%
    t() %>%
    prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(k12pca)$importance %>% pander(split.tables = Inf)
# Plot PCA
k12pcaPlot <- k12pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k12dge$samples) %>%
    ggplot(aes(PC1, PC2, colour = group, label = rRNA)) +
    geom_point(alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    labs(
        x = paste0("PC1 (", percent(summary(k12pca)$importance[2, "PC1"]), ")"),
        y = paste0("PC2 (", percent(summary(k12pca)$importance[2, "PC2"]), ")"),
        colour = "Genotype",
        title = "k = 12"
    ) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))
k12pcaRrna <- k12pca$x %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble() %>%
    dplyr::select(sample, PC1, PC2) %>%
    left_join(k12dge$samples) %>%
    ggplot(aes(PC1, rRNA, label = rRNA)) +
    geom_point(aes(colour = group), alpha = 0.8, size = 3) +
    geom_text_repel(show.legend = FALSE) +
    geom_smooth(method = "lm") +
    labs(
        x = paste0("PC1 (", percent(summary(k12pca)$importance[2, "PC1"]), ")"),
        y = "rRNA proportion",
        colour = "Genotype",
        title = "k = 12"
    ) +
    scale_y_continuous(labels = percent) +
    scale_colour_discrete(labels = c("Wildtype", "Mutant"))

Differential expression

k12design <- model.matrix(~rRNA, data = k12dge$samples)
k12voom <- voom(k12dge, design = k12design)
k12fit <- lmFit(k12voom, design = k12design)
k12eBayes <- eBayes(k12fit)
k12topTable <- k12eBayes %>%
    topTable(coef = colnames(k12design)[2], sort.by = "p", n = Inf) %>%
    set_colnames(str_remove(colnames(.), "ID\\.")) %>%
    rownames_to_column("mer") %>%
    mutate(BY = p.adjust(P.Value, "BY")) %>%
    mutate(DE = BY < 0.05) %>%
    dplyr::select(
        mer,
        AveExpr,
        logFC,
        P.Value,
        FDR = adj.P.Val,
        BY,
        t,
        DE,
        everything(),
        -B
    ) %>%
    as_tibble()

Distributions

ggarrange(
    k5dist, k6dist, k7dist, k8dist, k9dist, k10dist, k11dist, k12dist,
    ncol = 2, nrow = 4, common.legend = TRUE, legend = "bottom" 
)
*Distributions of kmer counts for each value of $k$*

Distributions of kmer counts for each value of \(k\)

PCA

ggarrange(
    k5pcaPlot, k6pcaPlot, k7pcaPlot, k8pcaPlot, k9pcaPlot, k10pcaPlot, k11pcaPlot, k12pcaPlot,
    ncol = 2, nrow = 4, common.legend = TRUE, legend = "bottom"
)
*Principal component analysis for all values of $k$. As $k$ increases, WT and mutant groups start to separate.*

Principal component analysis for all values of \(k\). As \(k\) increases, WT and mutant groups start to separate.

ggarrange(
    k5pcaRrna, k6pcaRrna, k7pcaRrna, k8pcaRrna, k9pcaRrna, k10pcaRrna, k11pcaRrna, k12pcaRrna,
    ncol = 2, nrow = 4, common.legend = TRUE, legend = "bottom"
)
*Principal component analysis for all values of $k$. As $k$ increases, WT and mutant groups start to separate.*

Principal component analysis for all values of \(k\). As \(k\) increases, WT and mutant groups start to separate.

Clustering

ggarrange(
    k5den, k6den, k7den, k8den, k9den, k10den, k11den, k12den,
    ncol = 2, nrow = 4, common.legend = TRUE, legend = "bottom"
)
*Hierarchical clustering of samples based on kmer counts. As $k$ increases, WT and mutant groups start to cluster.*

Hierarchical clustering of samples based on kmer counts. As \(k\) increases, WT and mutant groups start to cluster.

Differential expression

topRes <- function(x, cap){
    x %>% 
        dplyr::select(mer, AveExpr, logFC, P.Value, FDR, BY, DE) %>%
        mutate(
            AveExpr = format(round(AveExpr, 2), nsmall = 2),
            logFC = format(round(logFC, 2), nsmall = 2),
            P.Value = sprintf("%.2e", P.Value),
            FDR = sprintf("%.2e", FDR),
            BY = sprintf("%.2e", BY)
        ) %>%
        dplyr::slice(1:100) %>%
        datatable(
            options = list(pageLength = 10),
            class = "striped hover condensed responsive",
            filter = "top",
            caption = cap
        )
}

k = 5

topRes(k5topTable,
       cap = paste(
           "The top 100 differentially expressed 5-mers.",
           nrow(dplyr::filter(k5topTable, DE)),
           "of",
           nrow(k5topTable),
           "detected sequences were classified as DE with BY p-value < 0.05."
       )
)
k5topTable %>%
    ggplot(aes(P.Value)) +
    geom_histogram(
        binwidth = 0.02,
        colour = "black", fill = "grey90"
    ) +
    labs(title = "k = 5")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k5topTable %>%
    ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
    geom_point(alpha = 0.5) +
    scale_colour_manual(values = c("black", "grey50", "red")) +
    geom_text_repel(
        data = . %>%
            # dplyr::filter(-log10(P.Value) > 4 | logFC > 4 | logFC < -2),
            dplyr::filter(-log10(P.Value) > 4 | logFC > 3 | logFC < -2.5),
        aes(label = mer, color = "black")
    ) +
    labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 5") +
    theme_bw() +
    theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 6

topRes(k6topTable,
       cap = paste(
           "The top 100 differentially expressed 6-mers.",
           nrow(dplyr::filter(k6topTable, DE)),
           "of",
           nrow(k6topTable),
           "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k6topTable %>%
    ggplot(aes(P.Value)) +
    geom_histogram(
        binwidth = 0.02,
        colour = "black", fill = "grey90"
    ) +
    labs(title = "k = 6")
*Histogram of p-values. Values follow the expected distribution when there are some differences.*

Histogram of p-values. Values follow the expected distribution when there are some differences.

k6topTable %>%
    ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
    geom_point(alpha = 0.5) +
    scale_colour_manual(values = c("black", "grey50", "red")) +
    geom_text_repel(
        data = . %>%
            dplyr::filter(-log10(P.Value) > 6 | logFC > 6 | logFC < -2.3),
        aes(label = mer, color = "black")
    ) +
    labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 6") +
    theme_bw() +
    theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 7

topRes(k7topTable,
       cap = paste(
           "The top 100 differentially expressed 7-mers.",
           nrow(dplyr::filter(k7topTable, DE)),
           "of",
           nrow(k7topTable),
           "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k7topTable %>%
    ggplot(aes(P.Value)) +
    geom_histogram(
        binwidth = 0.02,
        colour = "black", fill = "grey90"
    ) +
    labs(title = "k = 7")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k7topTable %>%
    ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
    geom_point(alpha = 0.5) +
    scale_colour_manual(values = c("black", "grey50", "red")) +
    geom_text_repel(
        data = . %>%
            dplyr::filter(-log10(P.Value) > 6.4 | logFC > 10 | logFC < -5),
        aes(label = mer, color = "black")
    ) +
    labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 7") +
    theme_bw() +
    theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 8

topRes(k8topTable,
       cap = paste(
           "The top 100 differentially expressed 8-mers.",
           nrow(dplyr::filter(k8topTable, DE)),
           "of",
           nrow(k8topTable),
           "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k8topTable %>%
    ggplot(aes(P.Value)) +
    geom_histogram(
        binwidth = 0.02,
        colour = "black", fill = "grey90"
    ) +
    labs(title = "k = 8")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k8topTable %>%
    ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
    geom_point(alpha = 0.5) +
    scale_colour_manual(values = c("black", "grey50", "red")) +
    geom_text_repel(
        data = . %>%
            dplyr::filter(-log10(P.Value) > 7.2 | logFC > 12.5 | logFC < -8),
        aes(label = mer, color = "black")
    ) +
    labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 8") +
    theme_bw() +
    theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 9

topRes(k9topTable,
       cap = paste(
           "The top 100 differentially expressed 9-mers.",
           nrow(dplyr::filter(k9topTable, DE)),
           "of",
           nrow(k9topTable),
           "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k9topTable %>%
    ggplot(aes(P.Value)) +
    geom_histogram(
        binwidth = 0.02,
        colour = "black", fill = "grey90"
    ) +
    labs(title = "k = 9")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k9topTable %>%
    ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
    geom_point(alpha = 0.5) +
    scale_colour_manual(values = c("black", "grey50", "red")) +
    geom_text_repel(
        data = . %>%
            dplyr::filter(DE & -log10(P.Value) > 7.5 | logFC < -10 | logFC > 15),
        aes(label = mer, color = "black")
    ) +
    labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 9") +
    theme_bw() +
    theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 10

topRes(k10topTable,
       cap = paste(
           "The top 100 differentially expressed 10-mers.",
           nrow(dplyr::filter(k10topTable, DE)),
           "of",
           nrow(k10topTable),
           "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k10topTable %>%
    ggplot(aes(P.Value)) +
    geom_histogram(
        binwidth = 0.02,
        colour = "black", fill = "grey90"
    ) +
    labs(title = "k = 10")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k10topTable %>%
    ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
    geom_point(alpha = 0.5) +
    scale_colour_manual(values = c("black", "grey50", "red")) +
    geom_text_repel(
        data = . %>%
            dplyr::filter(-log10(P.Value) > 8.5 | logFC > 20 | logFC < -13.5),
        aes(label = mer, color = "black")
    ) +
    labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 10") +
    theme_bw() +
    theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 11

topRes(k11topTable,
       cap = paste(
           "The top 100 differentially expressed 11-mers.",
           nrow(dplyr::filter(k11topTable, DE)),
           "of",
           nrow(k11topTable),
           "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k11topTable %>%
    ggplot(aes(P.Value)) +
    geom_histogram(
        binwidth = 0.02,
        colour = "black", fill = "grey90"
    ) +
    labs(title = "k = 11")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k11topTable %>%
    ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
    geom_point(alpha = 0.5) +
    scale_colour_manual(values = c("black", "grey50", "red")) +
    geom_text_repel(
        data = . %>%
            dplyr::filter(-log10(P.Value) > 8.5 | logFC > 20 | logFC < -13.5),
        aes(label = mer, color = "black")
    ) +
    labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 11") +
    theme_bw() +
    theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.


k = 12

topRes(k12topTable,
       cap = paste(
           "The top 100 differentially expressed 12-mers.",
           nrow(dplyr::filter(k12topTable, DE)),
           "of",
           nrow(k12topTable),
           "detected sequences were classified as DE with a BY p-value < 0.05."
       )
)
k12topTable %>%
    ggplot(aes(P.Value)) +
    geom_histogram(
        binwidth = 0.02,
        colour = "black", fill = "grey90"
    ) +
    labs(title = "k = 12")
*Histogram of p-values. Values follow the expected distribution when there are many differences.*

Histogram of p-values. Values follow the expected distribution when there are many differences.

k12topTable %>%
    ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
    geom_point(alpha = 0.5) +
    scale_colour_manual(values = c("black", "grey50", "red")) +
    geom_text_repel(
        data = . %>%
            dplyr::filter(-log10(P.Value) > 8.5 | logFC > 20 | logFC < -13.5),
        aes(label = mer, color = "black")
    ) +
    labs(x = "logFC", y = expression(paste(-log[10], "(p)")), title = "k = 11") +
    theme_bw() +
    theme(legend.position = "none")
*Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.*

Volcano plot showing -log10(p-value) against logFC. Kmers classified as DE with BY p-value < 0.05 are highlighted in red.

Session info

save(
    k5counts,
    k6counts,
    k7counts,
    k8counts,
    k9counts,
    k10counts,
    k5topTable,
    k6topTable,
    k7topTable,
    k8topTable,
    k9topTable,
    k10topTable,
    file = here::here(
        "1_Psen2S4Ter/R/output/1_2_kmer.RData"
    )
)
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Adelaide
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggdendro_0.2.0   pheatmap_1.0.12  ggrepel_0.9.6    DT_0.33         
##  [5] edgeR_4.4.0      limma_3.62.1     kableExtra_1.4.0 ggpubr_0.6.0    
##  [9] scales_1.3.0     here_1.0.1       pander_0.6.5     magrittr_2.0.3  
## [13] lubridate_1.9.3  forcats_1.0.0    stringr_1.5.1    dplyr_1.1.4     
## [17] purrr_1.0.2      readr_2.1.5      tidyr_1.3.1      tibble_3.2.1    
## [21] ggplot2_3.5.1    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1   viridisLite_0.4.2  farver_2.1.2       fastmap_1.2.0     
##  [5] digest_0.6.37      timechange_0.3.0   lifecycle_1.0.4    statmod_1.5.0     
##  [9] compiler_4.4.2     rlang_1.1.4        sass_0.4.9         tools_4.4.2       
## [13] utf8_1.2.4         yaml_2.3.10        knitr_1.49         ggsignif_0.6.4    
## [17] labeling_0.4.3     htmlwidgets_1.6.4  bit_4.5.0          xml2_1.3.6        
## [21] RColorBrewer_1.1-3 abind_1.4-8        withr_3.0.2        grid_4.4.2        
## [25] fansi_1.0.6        colorspace_2.1-1   MASS_7.3-61        cli_3.6.3         
## [29] rmarkdown_2.28     crayon_1.5.3       generics_0.1.3     rstudioapi_0.17.1 
## [33] tzdb_0.4.0         cachem_1.1.0       splines_4.4.2      vctrs_0.6.5       
## [37] Matrix_1.7-1       jsonlite_1.8.9     carData_3.0-5      car_3.1-2         
## [41] hms_1.1.3          bit64_4.5.2        rstatix_0.7.2      systemfonts_1.1.0 
## [45] crosstalk_1.2.1    locfit_1.5-9.10    jquerylib_0.1.4    glue_1.8.0        
## [49] cowplot_1.1.3      stringi_1.8.4      gtable_0.3.6       munsell_0.5.1     
## [53] pillar_1.9.0       htmltools_0.5.8.1  R6_2.5.1           rprojroot_2.0.4   
## [57] vroom_1.6.5        evaluate_1.0.1     lattice_0.22-6     backports_1.5.0   
## [61] broom_1.0.7        bslib_0.8.0        Rcpp_1.0.13-1      svglite_2.1.3     
## [65] gridExtra_2.3      nlme_3.1-166       mgcv_1.9-1         xfun_0.49         
## [69] pkgconfig_2.0.3